# Alex F. Gazmararian
# agazmararian@gmail.com

library(tidyverse)
library(here)
library(tidylog)

#' Merge Annotated Statements with Covariates
#'
#' This function merges annotated statements with various covariate datasets including
#' political, economic, demographic, and geographic data.
#'
#' @param annotations_file Character. Path to the annotated statements CSV file. 
#'   Default: "data/inter/annotated_statements.csv"
#' @param output_file Character. Path for the merged output CSV file. 
#'   Default: "data/output/statements_analysis.csv"
#' @param project_year_cutoff Numeric. Latest year to include projects from. Default: 2024
#' @param ira_date Date. IRA enactment date for filtering. Default: "2022-08-16"
#' @param verbose Logical. Whether to print progress messages. Default: TRUE
#'
#' @return A data frame with merged statements and covariates, also saved to output_file
#'
#' @examples
#' # Use default primary annotations
#' df <- merge_statements_covariates()
#' 
#' # Use robustness check annotations  
#' df_robust <- merge_statements_covariates(
#'   annotations_file = "data/inter/annotated_statements_strict_all.csv",
#'   output_file = "data/output/statements_analysis_strict.csv"
#' )
merge_statements_covariates <- function(
    annotations_file = here("data", "inter", "annotated_statements.csv"),
    output_file = here("data", "output", "statements_analysis.csv"),
    project_year_cutoff = 2024,
    ira_date = as.Date("2022-08-16"),
    verbose = TRUE) {

# Load data----
if (verbose) message("=== LOADING DATA ===")
  annotations <- read_csv(annotations_file, show_col_types = FALSE)
  statements <- read_csv(here("data", "inter", "statements_processed.csv"), show_col_types = FALSE)
  govparty <- read_csv(here("data", "inter", "governors_processed.csv"), show_col_types = FALSE)
  senators_df <- read_csv(here("data", "inter", "senators_processed.csv"), show_col_types = FALSE)
  house_df <- read_csv(here("data", "inter", "house_processed.csv"), show_col_types = FALSE)
  turner <- read_csv(here("data", "inter", "turner_processed.csv"), show_col_types = FALSE)
  acs <- read_csv(here("data", "inter", "acs2023_county_processed.csv"))
  interstate <- read_csv(here("data", "inter", "highway_counties_processed.csv"))
  broadband <- read_csv(here("data", "inter", "broadband_county_processed.csv"))
  bea <- read_csv(here("data", "inter", "bea_county_processed.csv"))
  pres <- read_csv(here("data", "inter", "pres_elections_2020.csv"))
  eprice <- read_csv(here("data", "inter", "industrial_electricity_prices_state.csv"))
  emp <- read_csv(here("data", "inter", "emp_county_processed.csv"))
  union <- read_csv(here("data", "inter", "union_state_processed.csv"))
  cong_elec <- read_csv(here("data", "inter", "congressional_elections.csv"))

  # Merge annotations and statement meta-data----
  if (verbose) message("=== MERGE ANNOTATIONS AND STATEMENT META-DATA ===")
  # Check for un-annotated statements
  no.annotation <- anti_join(statements, annotations, by = "statement_id") %>%
    select(statement)
  per.empty <- sum(is.na(no.annotation$statement)) / nrow(no.annotation)
  if (per.empty != 1) {
      if (verbose) message(sprintf("Of %d un-annotated statements, %.2f%% are for empty statements (should be 100)", nrow(no.annotation), per.empty * 100))
      stop("Some statements are un-annotated. Please check the data.")
  }
  # Merge annotations and statement meta-data
  g <- left_join(statements, annotations, by = "statement_id")

  # Merge speaker partisanship data----
  # Governors
  if (verbose) message("=== MERGE GOVERNORS AND STATEMENTS ===")
  ## Check for unmatched governors
  check.gov <- anti_join(g, govparty, by = c("gov_last" = "gov_last", "state" = "state_abb")) %>%
    select(state, gov_last) %>%
    filter(state != "Not announced")
  if (nrow(check.gov) > 0) {
    stop("Some governors were not matched to partisanship data. Please check the data.")
    print(check.gov)
  }
  # Merge governors with statements
  g <- left_join(g, govparty, by = c("gov_last" = "gov_last", "state" = "state_abb"))

  # Representatives
  if (verbose) message("=== MERGE REPRESENTATIVES AND STATEMENTS ===")
  check.rep <- g %>%
    filter(is.na(district_clean)) %>%
    select(state, representative_last, district, representative_note) %>%
    distinct() %>%
    filter(!district %in% c("Not announced", "Vacant seat")) 
  if (nrow(check.rep) > 0) {
    warning("Some districts were not matched to partisanship data. Please check the data.")
    print(check.rep)
  }

  g <- left_join(g, house_df, 
      by = c(
          "state" = "state", 
          "representative_last" = "last_name", 
          "district_clean" = "district"
      )
  )

  # Senators
  if (verbose) message("=== MERGE SENATORS AND STATEMENTS ===")
  g <- g %>%
      left_join(senators_df, by = c("state", "senator_1_last" = "last_name")) %>%
      rename(senator_1_party = sen_party) %>%
      left_join(senators_df, by = c("state", "senator_2_last" = "last_name")) %>%
      rename(senator_2_party = sen_party)

check.sen <- g %>%
  filter(is.na(senator_1_party) | is.na(senator_2_party)) %>%
  select(state, senator_1_last, senator_2_last, senator_1_note, senator_2_note) %>%
  filter(state != "Not announced")
if (nrow(check.sen) > 0) {
  warning("Some senators were not matched to partisanship data. Please check the data.")
  print(check.sen)
}

# Congressional elections----
message("=== MERGE CONGRESSIONAL ELECTIONS AND STATEMENTS ===")
g <- g %>%
  left_join(cong_elec, by = c("election_year", "state", "district_num" = "district")) %>%
  relocate(c(date, cong_district_election_id, election_year), .before = state)

check.cong <- g %>%
  drop_na(district_num) %>%
  filter(is.na(cong_district_election_id)) %>%
  distinct(election_year, state, district_num) %>%
  rename(year = election_year) %>%
  mutate(has_missing_election_data = 1)
# And is not one of the districts in the middle of re-district
check.cong <- check.cong %>%
  filter(!(district_num == 8 & state == "CO") & !(district_num == 14 & state == "NC"))
if (nrow(check.cong) > 0) {
  warning("Some congressional districts have missing election data. Please check the data.")
  print(check.cong)
}

# BGM Turner project metadata----
message("=== MERGE TURNER PROJECTS AND STATEMENTS ===")
# Drop sector since Turner data will have more specific sector info
g <- subset(g, select = -c(sector))
# Inspect projects with missing FIPS
turner %>%
  filter(is.na(fips)) %>%
  distinct() %>%
  pull(operating_status) %>%
  table()
# Get relevant project metadata
turner.sub <- turner %>%
  select(site_id, operating_status, sector, mfg_activity,
    mfg_product, invest_m:current_jobs, capital_bin:operating, project_announcement_date)  

check.turner <- anti_join(g, turner.sub, by = c("id" = "site_id")) %>%
  select(id, project, date) %>%
  distinct()
if (nrow(check.turner) > 0) {
  print(check.turner)
  stop("Some statements were not matched to Turner data. Please check the data.")
}

g.out <- left_join(g, turner.sub, by = c("id" = "site_id"))

# Check for projects in turner but not annotated
in.turner.but.no.statement <- anti_join(turner.sub, g, by = c("site_id" = "id")) %>%
  filter(year(project_announcement_date) < 2025 & !is.na(project_announcement_date)) %>%
  select(site_id, operating_status, sector, project_announcement_date) %>%
  distinct()
if (nrow(in.turner.but.no.statement) > 0) {
  print(in.turner.but.no.statement)
  stop("Some projects were not matched to statements data. Please check the data.")
}

# Check correspondence between project announcement date and statement date
g.out %>%
  mutate(dif = date - project_announcement_date) %>%
  filter(abs(dif) > 0) %>%
  select(id, date, project_announcement_date, dif) %>%
  distinct()

# Filter based on Turner project announcement date
g.out <- g.out %>%
  filter(project_announcement_date > ira_date & year(project_announcement_date) <= project_year_cutoff)

# Use the project announcement date from turner rather than the statement date
g.out <- g.out %>%
  mutate(date = project_announcement_date) %>%
  select(-project_announcement_date)

# Process covariates
g.out$year <- lubridate::year(g.out$date)

# ACS data----
message("=== MERGE ACS AND STATEMENTS ===")
check.acs <- anti_join(g.out, acs, by = "fips")
check.acs <- check.acs %>%
  filter(!state %in% c("Not announced") & !district %in% c("Not announced") & !is.na(fips)) %>%
  dplyr::select(fips, state, city, district, id)
if (nrow(check.acs) > 0) {
  warning("Some statements with valid location aren't matched to ACS data. Please check the data.")
  print(check.acs)
}

g.out <- left_join(g.out, acs, by = "fips")

# Interstate data----
message("=== MERGE INTERSTATE AND STATEMENTS ===")
check.interstate <- anti_join(g.out, interstate, by = c("fips" = "fips.pre"))
check.interstate <- check.interstate %>%
  filter(!state %in% c("Not announced") & !district %in% c("Not announced") & !is.na(fips)) %>%
  dplyr::select(fips, state, city, district) %>%
  distinct()
if (nrow(check.interstate) > 0) {
  warning("Some statements with valid location aren't matched to Interstate data. Please check the data.")
  print(check.interstate)
}

g.out <- left_join(g.out, interstate, by = c("fips" = "fips.pre"))

# Broadband data----
message("=== MERGE BROADBAND AND STATEMENTS ===")
broadband$bb100 <- relevel(factor(broadband$bb100), ref = "0-400")
# Lag broadband data by one year
check.broadband <- anti_join(g.out, broadband, by = c("fips", "year" = "year_lag"))
check.broadband <- check.broadband %>%
  filter(!state %in% c("Not announced") & !district %in% c("Not announced") & !is.na(fips)) %>%
  dplyr::select(fips, state, city, year) %>%
  distinct()
if (nrow(check.broadband) > 0) {
  warning("Some statements aren't matched to Broadband data. Please check the data.")
  print(check.broadband)
}

broadband <- subset(broadband, select = -c(year, fips.post))

g.out <- left_join(g.out, broadband, by = c("fips" = "fips", "year" = "year_lag"))

# BEA data----
message("=== MERGE BEA AND STATEMENTS ===")
check.bea <- anti_join(g.out, bea, by = c("fips" = "fips.census", "year" = "year_lag"))
check.bea <- check.bea %>%
  filter(!state %in% c("Not announced") & !district %in% c("Not announced")) %>%
  dplyr::select(fips, state, city, year) %>%
  distinct()
if (nrow(check.bea) > 0) {
  warning("Some statements aren't matched to BEA data. Please check the data.")
  print(check.bea)
}

bea <- filter(bea, !is.na(fips.census))
bea <- subset(bea, select = -c(year, fips.bea))

g.out <- left_join(g.out, bea, by = c("fips" = "fips.census", "year" = "year_lag"))

# Presidential election data----
message("=== MERGE PRESIDENTIAL ELECTIONS AND STATEMENTS ===")
check.pres <- anti_join(g.out, pres, by = c("fips" = "fips"))
check.pres <- check.pres %>%
  filter(!state %in% c("Not announced") & !district %in% c("Not announced") & !is.na(fips)) %>%
  dplyr::select(fips, state, city)
if (nrow(check.pres) > 0) {
  warning("Some statements aren't matched to presidential data. Please check the data.")
  print(check.pres)
}

g.out <- left_join(g.out, pres, by = c("fips" = "fips"))

# Electricity prices----
message("=== MERGE ELECTRICITY PRICES AND STATEMENTS ===")
check.eprice <- anti_join(g.out, eprice, by = c("state" = "state.abb", "year" = "year_lag"))
check.eprice <- check.eprice %>%
  filter(!state %in% c("Not announced") & !district %in% c("Not announced")) %>%
  dplyr::select(state, year) %>%
  distinct()
if (nrow(check.eprice) > 0) {
  warning("Some statements aren't matched to electricity prices data. Please check the data.")
  print(check.eprice)
}

eprice <- subset(eprice, select = -c(state, year))

g.out <- left_join(g.out, eprice, by = c("state" = "state.abb", "year" = "year_lag"))

# Unemployment data----
message("=== MERGE UNEMPLOYMENT AND STATEMENTS ===")
check.emp <- anti_join(g.out, emp, by = c("fips" = "fips", "year" = "year"))
check.emp <- check.emp %>%
  filter(!state %in% c("Not announced") & !district %in% c("Not announced") & !is.na(fips)) %>%
  dplyr::select(fips, state, city, year) %>%
  distinct()
if (nrow(check.emp) > 0) {
  warning("Some statements aren't matched to unemployment data. Please check the data.")
  print(check.emp)
}

emp <- subset(emp, select = -c(year, fips.post))

g.out <- left_join(g.out, emp, by = c("fips", "year" = "year_lag"))

# Union data----
message("=== MERGE UNION AND STATEMENTS ===")
check.union <- anti_join(g.out, union, by = c("state" = "state.abb", "year" = "year_lag"))
check.union <- check.union %>%
  filter(!state %in% c("Not announced") & !district %in% c("Not announced")) %>%
  dplyr::select(state, year) %>%
  distinct()
if (nrow(check.union) > 0) {
  warning("Some statements aren't matched to union data. Please check the data.")
  print(check.union)
}

union <- subset(union, select = -c(year, state))

g.out <- left_join(g.out, union, by = c("state" = "state.abb", "year" = "year_lag"))

# Process covariates----
message("=== PROCESS COVARIATES ===")

# Gave statement
g.out$gave_statement <- as.integer(g.out$statement != "" & !is.na(g.out$statement))

# Biden or IRA credit indicator
g.out$credit_biden_ira <- as.integer(g.out$credit_biden + g.out$credit_ira > 0)

# Swing state indicator
g.out$swingstate <- as.integer(g.out$state %in% c("AZ", "GA", "MI", "NV", "PA", "WI"))
g.out$swingstate <- replace_na(g.out$swingstate, 0)

# Speaker party
g.out <- g.out %>%
    mutate(
        party = case_when(
            actor == "senator_1_text" ~ senator_1_party,
            actor == "senator_2_text" ~ senator_2_party,
            actor == "governor_text" ~ gov_party,
            actor == "representative_text" ~ rep_party
        )
    )

# Project status
g.out <- g.out %>%
  mutate(
    status = case_when(
      operating_status %in% c("Cancelled", "Closed", "Paused", "Sold", "Rumored") ~ "Cancelled/Closed/Paused/Sold/Rumored",
      operating_status %in% c("Operating", "Operating Partially; Under Construction") ~ "Operating",
      operating_status %in% c("Pilot", "Planned", "Under Construction") ~ "Pilot/Planned/Construction"
    ),
    status = factor(status, levels = c("Cancelled/Closed/Paused/Sold/Rumored", "Operating", "Pilot/Planned/Construction"))
  )
g.out$party_label <- ifelse(g.out$party == "D", "Democrat", "Republican")

# Manufacturing indicator
g.out$manufacturing_bin <- ifelse(g.out$mfg_activity == "Manufacturing", 1, 0)

# Capital investment bin
g.out$capital_bin_label <- ifelse(g.out$capital_bin == 1, "Investment amount specified", "Not specified")

# Quarters
g.out$quarter <- paste0(lubridate::year(g.out$date), "-Q", lubridate::quarter(g.out$date))

  # Standardize continuous covariates
  covars <- c(
    "college_acs", "poverty_acs", "foreign_acs", "housing_acs",
    "gdp_ln", "urate", "demshare_major", "eprice", "union_mem", "force_ln",
    "income_pc"
  )

  zfun <- function(x) {
    m <- mean(x, na.rm = TRUE)
    s <- sd(x,  na.rm = TRUE)
    if (is.na(s) || s == 0) return(rep(NA_real_, length(x)))
    (x - m) / s
  }

  g_z <- g.out %>%
    distinct(fips, .keep_all = TRUE) %>%
    mutate(across(all_of(covars), zfun, .names = "{.col}_z")) %>%
    select(fips, ends_with("_z"))

  g.out <- g.out %>%
    left_join(g_z, by = "fips")

  # Final Diagnostics----

  ## Summary statistics for key variables
  if (verbose) message("=== FINAL DIAGNOSTICS ===")

# Sample size and coverage
  if (verbose) message(sprintf("Final dataset contains %d observations", nrow(g.out)))
  if (verbose) message(sprintf("Unique projects: %d", n_distinct(g.out$id)))
  if (verbose) message(sprintf("Date range: %s to %s", min(g.out$date, na.rm = TRUE), max(g.out$date, na.rm = TRUE)))

# Statement coverage by actor type
statement_coverage <- g.out %>%
  group_by(actor) %>%
  summarise(
    total = n(),
    gave_statement = sum(gave_statement, na.rm = TRUE),
    pct_gave_statement = round(100 * gave_statement / total, 1),
    .groups = "drop"
  )
message("Statement coverage by actor type:")
print(statement_coverage)

# Party coverage
party_coverage <- g.out %>%
  filter(!is.na(party) & !actor %in% c("biden_text", "company_text")) %>%
  count(party, name = "count") %>%
  mutate(pct = round(100 * count / sum(count), 1))
message("Party distribution:")
print(party_coverage)

# Missing data patterns for key covariates
missing_patterns <- g.out %>%
  summarise(
    fips_missing = sum(is.na(fips) & state != "Not announced"),
    party_missing = sum(is.na(party) & !actor %in% c("biden_text", "company_text") & state != "Not announced"),
    broadband_missing = sum(is.na(bb100) & state != "Not announced"),
    bea_gdp_missing = sum(is.na(gdp) & state != "Not announced" & district != "Not announced"),
    electricity_missing = sum(is.na(eprice) & state != "Not announced"),
    union_missing = sum(is.na(union_mem) & state != "Not announced"),
    unemployment_missing = sum(is.na(urate) & state != "Not announced" & district != "Not announced")
  ) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "missing_count") %>%
  mutate(pct_missing = round(100 * missing_count / nrow(g.out), 1))
message("Missing data patterns:")
print(missing_patterns)

# Sector distribution
if ("sector" %in% names(g.out)) {
  sector_dist <- g.out %>%
    count(sector, sort = TRUE) %>%
    mutate(pct = round(100 * n / sum(n), 1))
  message("Sector distribution:")
  print(sector_dist)
}

# Geographic coverage
geo_coverage <- g.out %>%
  group_by(state) %>%
  summarise(
    projects = n_distinct(id),
    statements = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(projects))
message(sprintf("Geographic coverage: %d states/territories", nrow(geo_coverage)))
message("Top 10 states by project count:")
print(head(geo_coverage, 10))

# Temporal distribution
temporal_dist <- g.out %>%
  count(year, name = "statements") %>%
  arrange(year)
message("Temporal distribution:")
print(temporal_dist)

# Check for duplicates
duplicates <- g.out %>%
  group_by(id, actor) %>%
  filter(n() > 1) %>%
  ungroup()
if (nrow(duplicates) > 0) {
  warning(sprintf("Found %d duplicate project-actor combinations", nrow(duplicates)))
  print(duplicates %>% select(id, actor, project, state))
} else {
  message("No duplicate project-actor combinations found")
}

  # Save output----
  write_csv(g.out, output_file)
  if (verbose) message(sprintf("Saved %s", output_file))
  
  # Return the merged dataset
  return(g.out)
}

# Script execution when sourced directly----
# To skip automatic execution, set SKIP_MERGE_EXECUTION <- TRUE before sourcing
if (!exists("SKIP_MERGE_EXECUTION")) {
  # Default execution - use primary annotations
  message("=== RUNNING MERGE WITH DEFAULT SETTINGS ===")
  message("To use function manually, set SKIP_MERGE_EXECUTION <- TRUE before sourcing")
  message("Example usage:")
  message('df <- merge_statements_covariates(')
  message('  annotations_file = "data/inter/annotated_statements_robust.csv",')
  message('  output_file = "data/output/statements_analysis_robust.csv"')
  message(')')
  
  df <- merge_statements_covariates()
  message(sprintf("Primary merge complete: %d observations", nrow(df)))
}